Dependencies

library(dplyr)
library(ggplot2)
library(ggfortify)
library(Hmisc)
library(factoextra)
library(pca3d)
library(rgl)
library(Rtsne)
library(ggcorrplot)

Data info

[Data Sources] (https://search.r-project.org/CRAN/refmans/mlbench/html/PimaIndiansDiabetes.html)

Check the data structure

# import raw data
diabetes <- read.csv('diabetes.csv',header = T)
# replace 0 with NA
var_with_NAs <- c("Glucose","BloodPressure","SkinThickness","Insulin","BMI","Age")
for (i in var_with_NAs){
  diabetes[[i]][diabetes[[i]] == 0] <- NA
}
diabetes$Outcome[diabetes$Outcome == 0] <- 'healthy'
diabetes$Outcome[diabetes$Outcome == 1] <- 'diabetes'
# turn the outcome variable into factor
diabetes$Outcome <- as.factor(diabetes$Outcome)
# check the data structure
str(diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 NA 70 96 ...
##  $ SkinThickness           : int  35 29 NA 23 35 NA 32 NA 45 NA ...
##  $ Insulin                 : int  NA NA NA 94 168 NA 88 NA 543 NA ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "diabetes","healthy": 1 2 1 2 1 2 1 2 1 1 ...
# Remove NA rows
diabetes <-diabetes %>% na.omit()
dim(diabetes)
## [1] 392   9

After remove the NA values, the samples number reduced from 768 to 392

Data preview

knitr::kable(head(diabetes))
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
4 1 89 66 23 94 28.1 0.167 21 healthy
5 0 137 40 35 168 43.1 2.288 33 diabetes
7 3 78 50 32 88 31.0 0.248 26 diabetes
9 2 197 70 45 543 30.5 0.158 53 diabetes
14 1 189 60 23 846 30.1 0.398 59 diabetes
15 5 166 72 19 175 25.8 0.587 51 diabetes
diabetes_lm <- diabetes[, -9]
## subset based on diabetes or healthy samples
diabetes_sub <- diabetes_lm[diabetes$Outcome %in% "diabetes",]
healthy_sub <- diabetes_lm[diabetes$Outcome %in% "healthy",]

Data initial inspection

Scatter plot

pairs(~Pregnancies + Glucose + BloodPressure + SkinThickness + Insulin + BMI + DiabetesPedigreeFunction + Age,
      col = diabetes$Outcome, data = diabetes)

Histogram

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  hist(diabetes_lm[[i]], main = i, xlab = NULL)
}

histogram <- function(x,y){
  p1 <- hist(x,col=rgb(0,0,1,1/4), main = i, xlab = NULL, xlim = range(x))
  p2 <- hist(y,col=rgb(1,0,0,1/4),ylim = range(y), add=T)
  legend ("topright", legend = c("healthy","diabetes"),
          fill = c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)), cex = 0.8)
}

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  histogram(healthy_sub[[i]], diabetes_sub[[i]])
}

We can see from the histogram that some variables have a high skewness. So we do a log2 transform for some variables.

diabetes_lm$Glucose <- log2(diabetes_lm$Glucose)
diabetes_lm$Insulin <- log2(diabetes_lm$Insulin)
diabetes_lm$BMI <- log2(diabetes_lm$BMI)
diabetes_lm$DiabetesPedigreeFunction <- log2(diabetes_lm$DiabetesPedigreeFunction)

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  hist(diabetes_lm[[i]], main = i, xlab = NULL)
}

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  histogram(healthy_sub[[i]], diabetes_sub[[i]])
}

Q-Q plot

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  qqnorm(diabetes_lm[[i]], main = i);qqline(diabetes_lm[[i]], col = "red", lwd = 2)
}

Box plot

par(mfrow=c(2,4))
for (i in colnames(diabetes_lm)){
  boxplot(diabetes_lm[[i]] ~ diabetes[["Outcome"]], data = diabetes, main = i, xlab = NULL, ylab = NULL, col = c(rgb(1,0,0,1/4), rgb(0,0,1,1/4)))
}

Student t-test

t_test <- function (x,y) {
  var_test <- var.test(x, y, alternative = "two.sided")
  if (var_test$p.value < 0.05 & abs(var_test$estimate) < 1.1) {
     t.test(x, y, paired = FALSE, var.equal = TRUE, conf.level = 0.95)
  }
  else {
     t.test(x, y, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
  }
}
t_test(diabetes_sub$Glucose, healthy_sub$Glucose)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 11.151, df = 218.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  27.79385 39.72817
## sample estimates:
## mean of x mean of y 
##  145.1923  111.4313
t_test(diabetes_sub$BloodPressure, healthy_sub$BloodPressure)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 3.761, df = 237.75, p-value = 0.0002131
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.432216 7.782699
## sample estimates:
## mean of x mean of y 
##  74.07692  68.96947
t_test(diabetes_sub$SkinThickness, healthy_sub$SkinThickness)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 5.3693, df = 276.33, p-value = 1.677e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.616261 7.802999
## sample estimates:
## mean of x mean of y 
##  32.96154  27.25191
t_test(diabetes_sub$Insulin, healthy_sub$Insulin)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 5.7337, df = 207.88, p-value = 3.429e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   49.86272 102.11966
## sample estimates:
## mean of x mean of y 
##  206.8462  130.8550
t_test(diabetes_sub$DiabetesPedigreeFunction, healthy_sub$DiabetesPedigreeFunction)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 3.8245, df = 200.74, p-value = 0.0001749
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.07431668 0.23251667
## sample estimates:
## mean of x mean of y 
## 0.6255846 0.4721679
t_test(diabetes_sub$BMI, healthy_sub$BMI)
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 5.5571, df = 259.51, p-value = 6.797e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.599983 5.453875
## sample estimates:
## mean of x mean of y 
##  35.77769  31.75076

Correlation analysis

diabetes_lm_1 <- diabetes_lm[, -c(1,8)]
cor_res_pea <- rcorr(as.matrix(diabetes_lm_1), type = "pearson")
knitr::kable(cor_res_pea$r)
Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction
Glucose 1.0000000 0.2151180 0.1999548 0.6345966 0.2209202 0.1050930
BloodPressure 0.2151180 1.0000000 0.2325712 0.1250890 0.2934836 -0.0273957
SkinThickness 0.1999548 0.2325712 1.0000000 0.1982221 0.6721587 0.1149568
Insulin 0.6345966 0.1250890 0.1982221 1.0000000 0.2773418 0.1047894
BMI 0.2209202 0.2934836 0.6721587 0.2773418 1.0000000 0.1234325
DiabetesPedigreeFunction 0.1050930 -0.0273957 0.1149568 0.1047894 0.1234325 1.0000000
ggcorrplot(cor_res_pea$r)

diabetes_cor <- read.csv('diabetes.csv',header = T)
cor_res_spe <- rcorr(as.matrix(diabetes_cor), type = "spearman")
knitr::kable(cor_res_spe$r)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
Pregnancies 1.0000000 0.1307335 0.1851267 -0.0852223 -0.1267227 0.0001321 -0.0432415 0.6072163 0.1986887
Glucose 0.1307335 1.0000000 0.2351906 0.0600222 0.2132058 0.2311412 0.0912934 0.2850447 0.4757763
BloodPressure 0.1851267 0.2351906 1.0000000 0.1264859 -0.0067706 0.2928704 0.0300463 0.3508946 0.1429207
SkinThickness -0.0852223 0.0600222 0.1264859 1.0000000 0.5410001 0.4436145 0.1803905 -0.0667949 0.0897278
Insulin -0.1267227 0.2132058 -0.0067706 0.5410001 1.0000000 0.1927257 0.2211505 -0.1142129 0.0664716
BMI 0.0001321 0.2311412 0.2928704 0.4436145 0.1927257 1.0000000 0.1411920 0.1311859 0.3097067
DiabetesPedigreeFunction -0.0432415 0.0912934 0.0300463 0.1803905 0.2211505 0.1411920 1.0000000 0.0429086 0.1753535
Age 0.6072163 0.2850447 0.3508946 -0.0667949 -0.1142129 0.1311859 0.0429086 1.0000000 0.3090403
Outcome 0.1986887 0.4757763 0.1429207 0.0897278 0.0664716 0.3097067 0.1753535 0.3090403 1.0000000
ggcorrplot(cor_res_spe$r)

Simple linear regression

# function to make a pairwise linear regression between each variables
pairwise_lm <- function (dat) {
  n <- nrow(dat)
  p <- ncol(dat)
  LHS <- rep(colnames(dat), p)
  RHS <- rep(colnames(dat), each = p)
  ## function to fit and summarize a single model
  fitmodel <- function (LHS, RHS) {
    if (RHS == LHS) {
      z <- data.frame("LHS" = LHS, "RHS" = RHS,
                      "alpha" = 0,
                      "beta" = 1,
                      "beta.se" = 0,
                      "beta.tv" = Inf,
                      "beta.pv" = 0,
                      "sig" = 0,
                      "R2" = 1,
                      "F.fv" = Inf,
                      "F.pv" = 0,
                      stringsAsFactors = FALSE)
      } else {
      result <- summary(lm(reformulate(RHS, LHS), data = dat))
      z <- data.frame("LHS" = LHS, "RHS" = RHS,
                      "alpha" = result$coefficients[1, 1],
                      "beta" = result$coefficients[2, 1],
                      "beta.se" = result$coefficients[2, 2],
                      "beta.tv" = result$coefficients[2, 3],
                      "beta.pv" = result$coefficients[2, 4],
                      "sig" = result$sigma,
                      "R2" = result$r.squared,
                      "F.fv" = result$fstatistic[[1]],
                      "F.pv" = pf(result$fstatistic[[1]], 1, n - 2, lower.tail = FALSE),
                      stringsAsFactors = FALSE)
        }
      z
      }
  ## loop through all models
  do.call("rbind.data.frame", c(Map(fitmodel, LHS, RHS),
                                list(make.row.names = FALSE,
                                     stringsAsFactors = FALSE)))
  }
# perform pairwise linear regression
diabetes_lm_res <- pairwise_lm(diabetes_lm_1)
knitr::kable(diabetes_lm_res)
LHS RHS alpha beta beta.se beta.tv beta.pv sig R2 F.fv F.pv
Glucose Glucose 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000
BloodPressure Glucose 19.239382 7.4600096 1.7149113 4.3500849 0.0000174 12.2191693 0.0462758 18.9232389 0.0000174
SkinThickness Glucose -11.081240 5.8356382 1.4479842 4.0301808 0.0000670 10.3172477 0.0399819 16.2423573 0.0000670
Insulin Glucose -5.234593 1.7667161 0.1089503 16.2158024 0.0000000 0.7762978 0.4027128 262.9522459 0.0000000
BMI Glucose 3.726028 0.1871564 0.0418381 4.4733523 0.0000101 0.2981066 0.0488057 20.0108805 0.0000101
DiabetesPedigreeFunction Glucose -3.025995 0.2633538 0.1261891 2.0869767 0.0375391 0.8991290 0.0110445 4.3554719 0.0375391
Glucose BloodPressure 6.454936 0.0062032 0.0014260 4.3500849 0.0000174 0.3523539 0.0462758 18.9232389 0.0000174
BloodPressure BloodPressure 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000
SkinThickness BloodPressure 15.314729 0.1957266 0.0414464 4.7224043 0.0000033 10.2411614 0.0540894 22.3011019 0.0000033
Insulin BloodPressure 6.234255 0.0100421 0.0040332 2.4898660 0.0131948 0.9965799 0.0156473 6.1994327 0.0131948
BMI BloodPressure 4.509526 0.0071695 0.0011825 6.0628181 0.0000000 0.2921989 0.0861326 36.7577629 0.0000000
DiabetesPedigreeFunction BloodPressure -1.070738 -0.0019796 0.0036577 -0.5412239 0.5886622 0.9037964 0.0007505 0.2929234 0.5886622
Glucose SkinThickness 6.693588 0.0068513 0.0017000 4.0301808 0.0000670 0.3535147 0.0399819 16.2423573 0.0000670
BloodPressure SkinThickness 62.608884 0.2763516 0.0585193 4.7224043 0.0000033 12.1690123 0.0540894 22.3011019 0.0000033
SkinThickness SkinThickness 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000
Insulin SkinThickness 6.392759 0.0189088 0.0047345 3.9938208 0.0000777 0.9845380 0.0392920 15.9506046 0.0000777
BMI SkinThickness 4.447485 0.0195112 0.0010883 17.9280697 0.0000000 0.2263120 0.4517973 321.4156833 0.0000000
DiabetesPedigreeFunction SkinThickness -1.498309 0.0098706 0.0043191 2.2853659 0.0228274 0.8981418 0.0132151 5.2228974 0.0228274
Glucose Insulin 5.310459 0.2279443 0.0140569 16.2158024 0.0000000 0.2788427 0.4027128 262.9522459 0.0000000
BloodPressure Insulin 59.843601 1.5581618 0.6258015 2.4898660 0.0131948 12.4138260 0.0156473 6.1994327 0.0131948
SkinThickness Insulin 14.716264 2.0779705 0.5202964 3.9938208 0.0000777 10.3209545 0.0392920 15.9506046 0.0000777
Insulin Insulin 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000
BMI Insulin 4.430122 0.0843948 0.0148043 5.7006916 0.0000000 0.2936683 0.0769185 32.4978850 0.0000000
DiabetesPedigreeFunction Insulin -1.865587 0.0943223 0.0453280 2.0808812 0.0380968 0.8991580 0.0109808 4.3300667 0.0380968
Glucose BMI 5.585187 0.2607751 0.0582952 4.4733523 0.0000101 0.3518863 0.0488057 20.0108805 0.0000101
BloodPressure BMI 10.400730 12.0137079 1.9815386 6.0628181 0.0000000 11.9611204 0.0861326 36.7577629 0.0000000
SkinThickness BMI -87.007283 23.1557548 1.2915922 17.9280697 0.0000000 7.7964113 0.4517973 321.4156833 0.0000000
Insulin BMI 2.372084 0.9114126 0.1598775 5.7006916 0.0000000 0.9650655 0.0769185 32.4978850 0.0000000
BMI BMI 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000
DiabetesPedigreeFunction BMI -3.042082 0.3651121 0.1486382 2.4563813 0.0144694 0.8972218 0.0152356 6.0338091 0.0144694
Glucose DiabetesPedigreeFunction 6.944044 0.0419380 0.0200951 2.0869767 0.0375391 0.3588030 0.0110445 4.3554719 0.0375391
BloodPressure DiabetesPedigreeFunction 70.204291 -0.3791214 0.7004889 -0.5412239 0.5886622 12.5074058 0.0007505 0.2929234 0.5886622
SkinThickness DiabetesPedigreeFunction 30.766229 1.3388293 0.5858271 2.2853659 0.0228274 10.4600898 0.0132151 5.2228974 0.0228274
Insulin DiabetesPedigreeFunction 7.084803 0.1164181 0.0559465 2.0808812 0.0380968 0.9989393 0.0109808 4.3300667 0.0380968
BMI DiabetesPedigreeFunction 5.066666 0.0417285 0.0169878 2.4563813 0.0144694 0.3033215 0.0152356 6.0338091 0.0144694
DiabetesPedigreeFunction DiabetesPedigreeFunction 0.000000 1.0000000 0.0000000 Inf 0.0000000 0.0000000 1.0000000 Inf 0.0000000

Exploratory data analysis

Principal component analysis

pca_res <- prcomp(diabetes_lm,scale. = TRUE)
autoplot(pca_res, data = diabetes, colour = 'Outcome',
         frame = TRUE, frame.type = 'norm', shape = FALSE, label.size = 3)

## remove outliers
rows_remove <- c("446","121","178","618","255","520","29","460","461")
diabetes_lm_clean <- diabetes_lm[!(row.names(diabetes_lm) %in% rows_remove),]
diabetes_clean <- diabetes[!(row.names(diabetes) %in% rows_remove),]

## perform PCA again
pca_res <- prcomp(diabetes_lm_clean,scale. = TRUE)
autoplot(pca_res, data = diabetes_clean, colour = 'Outcome',
         frame = TRUE, frame.type = 'norm', shape = FALSE, label.size = 3)

autoplot(pca_res, data = diabetes_clean, colour = 'Outcome',
         loadings = TRUE,loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 3)

## check scree plot and Q-Q plot
pca_res %>% fviz_eig()

pca_res %>% get_eig()
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.7257637        34.072047                    34.07205
## Dim.2  1.3862286        17.327858                    51.39990
## Dim.3  1.2008741        15.010927                    66.41083
## Dim.4  0.9870816        12.338521                    78.74935
## Dim.5  0.7478677         9.348346                    88.09770
## Dim.6  0.3448684         4.310855                    92.40855
## Dim.7  0.3054505         3.818131                    96.22668
## Dim.8  0.3018653         3.773316                   100.00000
pca_res %>% fviz_eig(choice = "eigenvalue",
             geom = "line",
             addlabels = T) +
             scale_y_continuous(limits = c(0, 5))

pca_res[["x"]][,1] %>% qqnorm()

# ellipses
pca3d(pca_res, group=diabetes_clean$Outcome, show.ellipses=TRUE, ellipse.ci=0.75,      show.plane=FALSE)
## [1] 0.08271360 0.05859502 0.07373144
## Creating new device
# loading
pca3d(pca_res, group=diabetes_clean$Outcome, show.plane=TRUE , biplot = TRUE, biplot.vars = 8)
## [1] 0.08271360 0.05859502 0.07373144

3D PCA plot

3D PCA biplot

Maximum-likelihood factor analysis

factor_res <- factanal(diabetes_lm, factors = 3, scores = 'regression')
autoplot(factor_res, data = diabetes, colour = 'Outcome',loadings = TRUE, loadings.label = TRUE, loadings.label.size  = 3)

t-SNE analysis

tsne_out <- Rtsne(diabetes_lm, perplexity = 20)
pdat <- data.frame(tsne_out$Y, diabetes$Outcome)
colnames(pdat) = c("Y1", "Y2", "group")
ggplot(pdat,aes(Y1,Y2))+
  geom_point(aes(Y1,Y2, fill = group),shape = 21, color = "black") +
             stat_ellipse(aes(color = group, fill = group),
                          geom = "polygon",
                          alpha = 0.3,
                          linetype = 2)+
             theme_classic() +
             theme(legend.position = "top")